By: Lawrence (Jake) Klinkert and Hongjin (Tony) Yu
CS8321 Sect 001 1222
3/27/2022
Submission Details: Turn in the rendered jupyter notebook (exported as HTML) to canvas. Only one notebook per team is required, but team names must be on the assignment.
If using code from another author (not your own), you will be graded on the clarity of explanatory comments you add to the code.
from PIL import Image
import requests
# Notebook-wide switches: the training cell is guarded by `if TRAINING:` and
# every evaluation cell is guarded by `if EVALUATE:`.
TRAINING = True
EVALUATE = True
In this lab you will implement a style transfer algorithm with whitening and coloring transformations, using the work of Li et al. in their universal style transfer paper. An example implementation of training a decoder for different scales of VGG has been implemented for you to build from (https://github.com/8000net/universal-style-transfer-keras). However, you will be manipulating the code to work properly. Also, the code should be updated to work with the newest version of Keras/Tensorflow. As always, you can choose a PyTorch implementation if you prefer.
[2 Points] Look at the above decoder code and describe any errors (or imperfections) in the upsampling layers. Discuss how these errors could be fixed. Now implement these solutions either by updating the code or rewriting the implementation. Remember that you should use strided convolutions without any pooling steps (i.e., do NOT use unpooling). You can also use another network besides VGG if you desire (such as UNET, for example). If using a new network, be sure it uses only feedforward connections (for simplicity).
from keras.layers import Input, Conv2D, UpSampling2D
def decoder_layers(inputs, layer):
    """Starter-code decoder, shown unchanged so its structure can be critiqued.

    Chains Conv2D and UpSampling2D layers onto `inputs` and returns the
    tensor reached at the stage whose number equals `layer`.

    Args:
        inputs: tensor the first convolution is applied to.
        layer: stage number (1-5) at which to stop and return.

    Returns:
        The output tensor of the selected stage.  When `layer` is not one of
        1-5 the function falls off the end and implicitly returns None.
    """
    # Stage 1: a single 512-filter convolution, with no upsampling.
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block5_conv1')(inputs)
    if layer == 1:
        return x
    # Stage 2: 2x upsample followed by four 512-filter convolutions.
    x = UpSampling2D((2, 2), name='decoder_block4_upsample')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block4_conv4')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block4_conv3')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block4_conv2')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='decoder_block4_conv1')(x)
    if layer == 2:
        return x
    # Stage 3: 2x upsample followed by four 256-filter convolutions.
    x = UpSampling2D((2, 2), name='decoder_block3_upsample')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='decoder_block3_conv4')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='decoder_block3_conv3')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='decoder_block3_conv2')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='decoder_block3_conv1')(x)
    if layer == 3:
        return x
    # Stage 4: 2x upsample followed by two 128-filter convolutions.
    x = UpSampling2D((2, 2), name='decoder_block2_upsample')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='decoder_block2_conv2')(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='decoder_block2_conv1')(x)
    if layer == 4:
        return x
    # Stage 5: 2x upsample followed by two 64-filter convolutions.
    x = UpSampling2D((2, 2), name='decoder_block1_upsample')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block1_conv2')(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='decoder_block1_conv1')(x)
    if layer == 5:
        return x
# Fetch the encoder-decoder architecture figure over HTTP and open it with PIL;
# as a bare expression it is presumably rendered as the notebook cell's output.
Image.open(requests.get("https://www.researchgate.net/profile/Arsal-Syed-2/publication/335495371/figure/fig2/AS:892800157638656@1589871553065/SegNets-encoder-decoder-architecture-based-off-VGG16-with-fully-connected-layers-removed.ppm", stream=True).raw)
Image of VGG16 encoder-decoder architecture
From the above code and the figure showing the architecture of VGG16, we can determine the correct structure of the decoder. The major error in the code is that the decoder is not a mirror image of the encoder. First, the layering order is reversed in the decoder, and it does not capture an entire block. Second, the number of convolutional layers per block is wrong; it should be 4, 4, 4, 2, 2. Third, there is no softmax activation at the end of the decoder. Finally, a minor issue is that if the user does not provide a layer between 1 and 5, the function does not return anything (it implicitly returns None).
# decoder.py
def decoder_layers(inputs, layer):
    """Build the decoder that mirrors the encoder up to block `layer`.

    The decoder starts at block `layer` and applies every reversed block down
    to block 1, so an encoding taken after encoder block `layer` is upsampled
    `layer` times.

    Args:
        inputs: encoder output tensor to decode.
        layer: deepest encoder block that produced `inputs`; must be 1-5.

    Returns:
        The output tensor of reversed block 1.

    Raises:
        ValueError: if `layer` is not one of 1-5 (previously the function
            silently returned None in that case).
    """
    if layer not in (1, 2, 3, 4, 5):
        raise ValueError(f"layer must be an integer between 1 and 5, got {layer!r}")

    # (block_number, n_filter, n_conv), deepest block first; the filter and
    # convolution counts match the corresponding encoder blocks.
    block_specs = (
        (5, 512, 4),
        (4, 512, 4),
        (3, 256, 4),
        (2, 128, 2),
        (1, 64, 2),
    )

    x = inputs
    for block_number, n_filter, n_conv in block_specs:
        # Blocks deeper than the requested layer were never encoded; skip them.
        if block_number > layer:
            continue
        x = vgg_block_reverse(
            layer_in=x,
            n_filter=n_filter,
            n_conv=n_conv,
            block_number=block_number,
            trainable=True,
        )
    return x
#vgg.py
from keras.models import Model
from keras.layers import Conv2D, MaxPooling2D, Input, UpSampling2D, BatchNormalization
from keras.utils.data_utils import get_file
import keras.backend as K
import h5py
import numpy as np
import tensorflow as tf
# Download URL of the VGG19 "no top" weight file.
WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
# Per-channel offsets subtracted by preprocess_input after it flips RGB -> BGR.
MEAN_PIXEL = np.array([103.939, 116.779, 123.68])
# Resolved when this cell runs: keras' get_file returns the local path of the
# weight file (using the 'models' cache subdirectory and the given file hash).
WEIGHTS_PATH = get_file('vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5',
                        WEIGHTS_PATH_NO_TOP,
                        cache_subdir='models',
                        file_hash='253f8cb515780f3b799900260a226db6')
def vgg_layers(inputs, target_layer):
    """Stack encoder blocks on `inputs`, stopping after block `target_layer`.

    Args:
        inputs: input tensor of the encoder.
        target_layer: number of the last block to apply.  Any value other
            than 1-4 yields the output of all five blocks.

    Returns:
        The output tensor of the last block applied.
    """
    # (block_number, n_filter, n_conv) in encoder order.
    block_plan = (
        (1, 64, 2),
        (2, 128, 2),
        (3, 256, 4),
        (4, 512, 4),
        (5, 512, 4),
    )
    features = inputs
    for number, filters, convs in block_plan:
        features = vgg_block(features, n_filter=filters, n_conv=convs, block_number=number, trainable=False)
        if target_layer == number:
            break
    return features
def vgg_block(layer_in, n_filter, n_conv, block_number, trainable=True):
    """Append one encoder block: `n_conv` conv+batch-norm pairs, max-pool, batch-norm.

    Args:
        layer_in: tensor the block is applied to.
        n_filter: number of filters of every convolution in the block.
        n_conv: number of 3x3 convolutions in the block.
        block_number: used to build the layer names
            `encode_block{block_number}_conv{i}` / `encode_block{block_number}_pool`.
        trainable: whether the block's layers are trainable.

    Returns:
        The block's output tensor (spatial size halved by the 2x2 pooling).
    """
    # `trainable` must be set on the layer objects.  The previous code assigned
    # it to the layers' *output tensors*, which has no effect on the weights,
    # so blocks requested with trainable=False were still being trained.
    x = layer_in
    for conv_num in range(n_conv):
        x = Conv2D(
            n_filter,
            (3, 3),
            activation='relu',
            padding='same',
            name=f'encode_block{block_number}_conv{conv_num}',
            trainable=trainable,
        )(x)
        # NOTE: a BatchNormalization layer created with trainable=False also
        # runs in inference mode in TF2 (it uses its moving statistics).
        x = BatchNormalization(trainable=trainable)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name=f'encode_block{block_number}_pool')(x)
    x = BatchNormalization(trainable=trainable)(x)
    return x
def vgg_block_reverse(layer_in, n_filter, n_conv, block_number, trainable=True):
    """Append one decoder block: 2x upsample, batch-norm, then `n_conv` conv+batch-norm pairs.

    Args:
        layer_in: tensor the block is applied to.
        n_filter: number of filters of every convolution in the block.
        n_conv: number of 3x3 convolutions in the block.
        block_number: used to build the layer names
            `decode_block{block_number}_upsample` / `decode_block{block_number}_conv{i}`.
        trainable: whether the block's layers are trainable.

    Returns:
        The block's output tensor (spatial size doubled by the upsampling).
    """
    # `trainable` is passed to the layer constructors; assigning it to the
    # output tensors (as before) never reached the layers themselves.
    x = UpSampling2D((2, 2), name=f'decode_block{block_number}_upsample')(layer_in)
    x = BatchNormalization(trainable=trainable)(x)
    for conv_num in range(n_conv):
        x = Conv2D(
            n_filter,
            (3, 3),
            activation='relu',
            padding='same',
            name=f'decode_block{block_number}_conv{conv_num}',
            trainable=trainable,
        )(x)
        x = BatchNormalization(trainable=trainable)(x)
    return x
def load_weights(model):
    """Copy pre-trained weights from WEIGHTS_PATH into matching layers of `model`.

    A layer is matched by its own name or, for encoder convolutions named
    `encode_block{b}_conv{i}` (i counted from 0), by the alias
    `block{b}_conv{i + 1}`.  Every layer that receives weights is frozen.

    Args:
        model: Keras model whose layers are filled in place.

    Warns:
        UserWarning: when no layer of `model` matched any stored layer, i.e.
            the model keeps its random initialisation.
    """
    import re
    import warnings

    def _as_text(name):
        # h5py may hand back attribute strings as bytes or str.
        return name.decode('utf8') if isinstance(name, bytes) else str(name)

    # The model's encoder convolutions are named `encode_block{b}_conv{i}`,
    # which never equals a name stored in the weight file, so the old exact
    # match silently loaded nothing.
    # NOTE(review): the alias assumes the file uses the keras.applications
    # naming `block{b}_conv{1..n}` - confirm against the downloaded file.
    alias_pattern = re.compile(r'^encode_block(\d+)_conv(\d+)$')

    loaded = 0
    # The context manager closes the file even if set_weights raises.
    with h5py.File(WEIGHTS_PATH, 'r') as f:
        stored_names = {_as_text(name) for name in f.attrs['layer_names']}
        for layer in model.layers:
            candidates = [layer.name]
            match = alias_pattern.match(layer.name)
            if match:
                candidates.append(f'block{match.group(1)}_conv{int(match.group(2)) + 1}')
            stored_name = next((name for name in candidates if name in stored_names), None)
            if stored_name is None:
                continue
            g = f[stored_name]
            # Materialise the datasets while the file is still open.
            weights = [np.asarray(g[_as_text(name)]) for name in g.attrs['weight_names']]
            layer.set_weights(weights)
            layer.trainable = False
            loaded += 1

    if loaded == 0:
        warnings.warn(
            f"load_weights: no layer of model '{model.name}' matched the weight "
            "file; the model keeps its random initialisation."
        )
def VGG19(input_tensor=None, input_shape=None, target_layer=1):
    """Build the encoder model up to block `target_layer` and load its weights.

    Args:
        input_tensor: optional existing tensor to wrap as the model input.
        input_shape: shape passed to the Input layer.
        target_layer: last encoder block to include (forwarded to vgg_layers).

    Returns:
        A Keras Model named 'vgg19' on which load_weights has been called.
    """
    if input_tensor is not None:
        inputs = Input(tensor=input_tensor, shape=input_shape)
    else:
        inputs = Input(shape=input_shape)
    features = vgg_layers(inputs, target_layer)
    encoder = Model(inputs, features, name='vgg19')
    load_weights(encoder)
    return encoder
def preprocess_input(x):
    """Reverse the last axis of `x` (RGB -> BGR) and subtract MEAN_PIXEL.

    Plain numpy arrays are flipped with slicing; anything else is flipped
    with tf.reverse.
    """
    is_plain_ndarray = type(x) is np.ndarray
    flipped = x[..., ::-1] if is_plain_ndarray else tf.reverse(x, [-1])
    return flipped - MEAN_PIXEL
#model.py
from keras.models import Model, Sequential, load_model
from keras.layers import Conv2D, Input
import keras.backend as K
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
# Weight applied to the feature (encoder) term in EncoderDecoder.loss_function.
LAMBDA = 1.0


def l2_loss(x):
    """Return half of the sum of the squared entries of `x`."""
    squared_sum = K.sum(K.square(x))
    return squared_sum / 2
class EncoderDecoder:
    """Encoder (VGG19 up to `target_layer`) chained with a trainable decoder.

    Attributes:
        input_shape: shape of the images fed to the model.
        target_layer: encoder block whose output the decoder inverts.
        encoder: frozen encoder model.
        decoder: decoder model, either freshly built or loaded from disk.
        model: encoder followed by decoder, compiled with `loss_function`.
        inputs: input tensor of `model`.
        history: None, a Keras History (after fitting) or a dict of metric
            lists (after loading); use `_history_dict` to read it uniformly.
    """

    def __init__(self, input_shape=(256, 256, 3), target_layer=5, decoder_path=None):
        """Build the encoder/decoder pair and compile the combined model.

        Args:
            input_shape: shape of one input image.
            target_layer: encoder block to decode from (1-5).
            decoder_path: optional directory containing
                `decoder_{target_layer}_model.h5` and, optionally,
                `decoder_{target_layer}_history.npy`; when given, the decoder
                is loaded instead of created.
        """
        self.input_shape = input_shape
        self.target_layer = target_layer
        self.encoder = VGG19(input_shape=input_shape, target_layer=target_layer)
        # Only the decoder is meant to learn; freezing the whole encoder model
        # here keeps its weights out of the optimizer regardless of how the
        # individual encoder layers were flagged.
        self.encoder.trainable = False
        self.history = None

        if decoder_path:
            decoder_full_path = os.path.join(decoder_path, f"decoder_{target_layer}_model.h5")
            history_full_path = os.path.join(decoder_path, f"decoder_{target_layer}_history.npy")
            # The saved decoder carries no training configuration, so skip the
            # compile attempt; the combined model is compiled below.
            self.decoder = load_model(decoder_full_path, compile=False)
            if os.path.exists(history_full_path):
                # NOTE: allow_pickle executes pickled data - only load history
                # files written by export_decoder.
                self.history = np.load(history_full_path, allow_pickle=True).item()
        else:
            self.decoder = self.create_decoder(target_layer)

        inputs = Input(shape=self.input_shape, name='img_input')
        self.inputs = inputs
        code = self.encoder(inputs)
        reconstruction = self.decoder(code)
        self.model = Model(inputs, reconstruction)
        self.model.compile(loss=self.loss_function, optimizer='adamax')
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.model.summary()

    @staticmethod
    def _unit_norm(tensor):
        """Scale `tensor` by its global L2 norm; epsilon guards an all-zero tensor."""
        return tensor / (K.sqrt(K.sum(K.square(tensor))) + K.epsilon())

    def _history_dict(self):
        """Return the metric dict for either a History object or a loaded dict."""
        if self.history is None:
            return None
        return getattr(self.history, 'history', self.history)

    def loss_function(self, y_true, y_pred):
        """Image L2 loss plus LAMBDA times the encoder-feature L2 loss.

        Both the images and their encodings are L2-normalised before the
        differences are taken.
        """
        encoding_in = self.encoder(y_true)
        encoding_out = self.encoder(y_pred)
        image_loss = l2_loss(self._unit_norm(y_pred) - self._unit_norm(y_true))
        feature_loss = l2_loss(self._unit_norm(encoding_out) - self._unit_norm(encoding_in))
        return image_loss + LAMBDA * feature_loss

    def create_decoder(self, target_layer):
        """Build a new decoder model that maps encoder features to a 3-channel image."""
        inputs = Input(shape=self.encoder.output_shape[1:])
        layers = decoder_layers(inputs, target_layer)
        # Sigmoid gives each RGB channel its own [0, 1] range.  A softmax over
        # the channel axis would force R + G + B == 1 at every pixel, so
        # brightness (e.g. a white pixel) could not be reconstructed.
        output = Conv2D(3, (3, 3), activation='sigmoid', padding='same', name='decoder_out')(layers)
        return Model(inputs, output, name='decoder_%s' % target_layer)

    def Fit_Dataset(self, x, y, epochs, verbose):
        """Fit the combined model on in-memory arrays and keep the History."""
        self.history = self.model.fit(x=x, y=y, epochs=epochs, verbose=verbose)

    def Fit_Generator(self, gen, train_steps, test_steps, epochs, verbose):
        """Fit the combined model from a (training, validation) generator pair.

        Args:
            gen: sequence whose item 0 is the training generator and item 1
                the validation generator.
            train_steps: batches drawn per training epoch.
            test_steps: batches drawn per validation pass.
            epochs: number of epochs.
            verbose: Keras verbosity level.
        """
        # Model.fit accepts generators; Model.fit_generator is deprecated.
        self.history = self.model.fit(
            gen[0],
            steps_per_epoch=train_steps,
            validation_data=gen[1],
            validation_steps=test_steps,
            epochs=epochs,
            verbose=verbose,
        )

    def export_decoder(self, rel_dir):
        """Save the decoder and, when available, its training history under `rel_dir`.

        `rel_dir` is resolved against the current working directory and is
        created if it does not exist.
        """
        full_path = os.path.join(os.getcwd(), rel_dir)
        os.makedirs(full_path, exist_ok=True)

        model_full_path = os.path.join(full_path, f'decoder_{self.target_layer}_model.h5')
        self.decoder.save(model_full_path)

        history = self._history_dict()
        if history is not None:
            history_full_path = os.path.join(full_path, f'decoder_{self.target_layer}_history.npy')
            np.save(history_full_path, history)

    def Plot_History(self):
        """Plot the training loss and, when recorded, the validation loss per epoch.

        Raises:
            ValueError: if no training history is available.
        """
        history = self._history_dict()
        if not history:
            raise ValueError("No training history available to plot.")
        plt.title("Decoder Loss")
        plt.plot(history['loss'], color="red", label="training")
        if 'val_loss' in history:
            plt.plot(history['val_loss'], color="blue", label="validation")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()
#train.py
import sys
import os
from skimage.util import random_noise
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # To run on CPU, uncomment this line
import numpy as np
from keras.preprocessing import image
from keras.callbacks import Callback
from PIL import Image
import cv2
#from model import EncoderDecoder
import zipfile
# Dataset archive and the directory the training cell extracts it into.
ZIP_PATH = 'data/RafD_resized.zip'
UNZIP_PATH = 'data/'
# Encoder block whose decoder the training cell fits.
TARGET_LAYER = 1
# Image directory the training cell hands to create_gen.
IMAGE_PATH = "data/RafD_test"
#IMAGES_PATH = 'data/RafD_train'
# Size every image is resized to by the generators.
TARGET_SIZE = (256, 256)
BATCH_SIZE = 8
EPOCHS = 12
# Base directory against which the relative paths above are resolved.
cwd = os.getcwd()
def create_gen(train_img_dir, target_size, batch_size, validation_split=0.2):
    """Create endless (input, target) batch generators for training and validation.

    Images are read from `train_img_dir`, resized to `target_size`, rescaled
    by 1/255 and split into a training and a validation subset.

    Args:
        train_img_dir: directory scanned by `flow_from_directory`.
        target_size: (height, width) every image is resized to.
        batch_size: number of images per batch.
        validation_split: fraction of the images reserved for validation.

    Returns:
        A `(train_generator, validation_generator)` tuple.  Training batches
        are `(images, images)`; validation batches are
        `(salt-and-pepper-noised images, clean images)`.
    """
    datagen = image.ImageDataGenerator(rescale=1./255, validation_split=validation_split)
    flow_kwargs = dict(
        directory=train_img_dir,
        target_size=target_size,
        batch_size=batch_size,
        color_mode='rgb',
        class_mode="input",
        shuffle=True,
        interpolation="bilinear",
        seed=123,
    )
    train_gen = datagen.flow_from_directory(subset='training', **flow_kwargs)
    validation_gen = datagen.flow_from_directory(subset='validation', **flow_kwargs)

    def train_tuple_gen():
        # class_mode="input" already yields (x, y) batches.
        yield from train_gen

    def val_tuple_gen():
        for img in validation_gen:
            noise_img = random_noise(img[0], mode='s&p', amount=0.1)
            # Keep the noisy input as float32 in [0, 1], the same scale as the
            # training inputs.  It used to be cast to uint8 in [0, 255], which
            # fed the model inputs 255x larger than anything it was trained on.
            yield (noise_img.astype('float32'), img[1])

    return train_tuple_gen(), val_tuple_gen()
# This needs to be in scope where model is defined
class OutputPreview(Callback):
    """Keras callback that periodically saves the model's output for one test image."""

    def __init__(self, model, test_img_path, increment, preview_dir_path):
        """Load and preprocess the preview image.

        Args:
            model: model whose `predict` produces the preview.
            test_img_path: path of the image to reconstruct.
            increment: a preview is written every `increment` batches.
            preview_dir_path: directory the previews are written to
                (created if missing).

        Raises:
            FileNotFoundError: if the test image cannot be read.
        """
        super().__init__()
        bgr_img = cv2.imread(test_img_path, 1)
        if bgr_img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read test image: {test_img_path}")
        resized = cv2.resize(src=bgr_img, dsize=(256, 256), interpolation=cv2.INTER_CUBIC)
        # Match the training generator's input format: RGB channel order,
        # values rescaled to [0, 1].
        rgb_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
        test_target = image.img_to_array(rgb_img) / 255.0
        self.test_img = np.expand_dims(test_target, axis=0)
        self.model = model
        self.preview_dir_path = preview_dir_path
        os.makedirs(self.preview_dir_path, exist_ok=True)
        self.increment = increment
        self.iteration = 0

    def on_batch_end(self, batch, logs=None):
        """Write `<iteration>.jpg` every `increment` batches."""
        if self.iteration % self.increment == 0:
            output_img = self.model.predict(self.test_img)[0]
            out_path = os.path.join(self.preview_dir_path, '%d.jpg' % self.iteration)
            # PIL cannot build an RGB image from a float array; convert the
            # [0, 1] prediction to 8-bit first.
            pixels = (np.clip(output_img, 0.0, 1.0) * 255).astype('uint8')
            Image.fromarray(pixels).save(out_path)
        self.iteration += 1
if TRAINING:
    # Unpack the dataset archive next to the notebook.
    zip_full_path = os.path.join(cwd, ZIP_PATH)
    unzip_full_path = os.path.join(cwd, UNZIP_PATH)
    with zipfile.ZipFile(zip_full_path, 'r') as zip_ref:
        zip_ref.extractall(unzip_full_path)

    image_full_path = os.path.join(cwd, IMAGE_PATH)
    validation_split = 0.2
    gen = create_gen(image_full_path, TARGET_SIZE, BATCH_SIZE, validation_split=validation_split)

    # Derive the step counts from the files actually present instead of a
    # hard-coded dataset size, and keep them as integers.
    # NOTE(review): assumes a single class sub-directory, as in this dataset;
    # Keras applies the validation split per class.
    image_exts = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')
    num_samples = 0
    for _, _, file_names in os.walk(image_full_path):
        num_samples += sum(name.lower().endswith(image_exts) for name in file_names)
    num_test_samples = int(num_samples * validation_split)
    num_train_samples = num_samples - num_test_samples
    # At least one step each so tiny datasets still train and validate.
    train_steps_per_epoch = max(1, num_train_samples // BATCH_SIZE)
    test_steps_per_epoch = max(1, num_test_samples // BATCH_SIZE)

    target_layer = TARGET_LAYER
    encoder_decoder = EncoderDecoder(target_layer=target_layer)
    encoder_decoder.Fit_Generator(gen, train_steps_per_epoch, test_steps_per_epoch, epochs=EPOCHS, verbose=1)

    encoder_decoder.export_decoder('models/')
    print("finished!")
Found 58 images belonging to 1 classes.
Found 14 images belonging to 1 classes.
KerasTensor(type_spec=TensorSpec(shape=(None, 128, 128, 64), dtype=tf.float32, name='input_54'), name='input_54', description="created by layer 'input_54'")
decoder layer is:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 64), dtype=tf.float32, name=None), name='batch_normalization_200/FusedBatchNormV3:0', description="created by layer 'batch_normalization_200'")
output:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name=None), name='decoder_out/softmax_16/Softmax:0', description="created by layer 'decoder_out'")
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_55'), name='input_55', description="created by layer 'input_55'")
Model: "model_18"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 128, 128, 64) 39488
decoder_1 (Functional) (None, 256, 256, 3) 76355
=================================================================
Total params: 115,843
Trainable params: 115,075
Non-trainable params: 768
_________________________________________________________________
None
Epoch 1/12
C:\Users\JAKEKL~1\AppData\Local\Temp/ipykernel_30280/462566863.py:75: UserWarning: `Model.fit_generator` is deprecated and will be removed in a future version. Please use `Model.fit`, which supports generators. self.history = self.model.fit_generator(gen[0], steps_per_epoch=train_steps, validation_data = gen[1], validation_steps = test_steps, epochs=epochs, verbose=verbose)
7/7 [==============================] - 48s 7s/step - loss: 0.3560 - val_loss: 0.5908 Epoch 2/12 7/7 [==============================] - 36s 6s/step - loss: 0.1862 - val_loss: 0.6038 Epoch 3/12 7/7 [==============================] - 39s 6s/step - loss: 0.1611 - val_loss: 0.6107 Epoch 4/12 7/7 [==============================] - 39s 5s/step - loss: 0.1578 - val_loss: 0.6298 Epoch 5/12 7/7 [==============================] - 37s 5s/step - loss: 0.1551 - val_loss: 0.6457 Epoch 6/12 7/7 [==============================] - 38s 5s/step - loss: 0.1443 - val_loss: 0.6469 Epoch 7/12 7/7 [==============================] - 41s 6s/step - loss: 0.1418 - val_loss: 0.6566 Epoch 8/12 7/7 [==============================] - 40s 6s/step - loss: 0.1399 - val_loss: 0.6660 Epoch 9/12 7/7 [==============================] - 45s 7s/step - loss: 0.1418 - val_loss: 0.6798 Epoch 10/12 7/7 [==============================] - 41s 6s/step - loss: 0.1410 - val_loss: 0.6909 Epoch 11/12 7/7 [==============================] - 41s 6s/step - loss: 0.1442 - val_loss: 0.7003 Epoch 12/12 7/7 [==============================] - 41s 6s/step - loss: 0.1397 - val_loss: 0.6556 WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model. finished!
[5 Points] Train two (or more) image decoders that can decode an image from two different convolutional layers from VGG (or whatever network you want). You should choose one reconstruction from an early layer (like block 1 or 2 in VGG) and from a later layer (like block 3 or later in VGG).
# Fetch the multi-level stylization figure over HTTP and open it with PIL;
# as a bare expression it is presumably rendered as the notebook cell's output.
Image.open(requests.get("https://miro.medium.com/max/1034/1*urbx5J1WFhjA9jcYCqnIBg.png", stream=True).raw)
Since we are taking a multi-level stylization approach to style transfer, shown in the figure above, we've chosen to train layers 2 and 3. This way we can connect them together in series to see the next level of detail, rather than jumping between layers that are further away from each other.
If using Keras, there is starter code available at the link above. You may use any image dataset you like for training. One nice option might be the labeled faces in the wild dataset, available in scikit-learn. This is a modest sized dataset, but is only of faces, which can help speed along training because of the reduced variability.
For the loss function, be sure to use both the L2 loss of the reconstructed image and the encoder representation (i.e., image loss and feature loss).
Note, decoding training can be sped up significantly by manipulating the above code to train multiple decoders in the same pass.
If your auto encoder fails to converge, you may use pre-trained image decoder weights (from anywhere online). Be sure to cite any code or weights you use properly. For example, you can use the pre-trained auto encoder from class:
Plot both training and validation losses versus the number of epochs to show that the training has converged.
[1 Points] Show a few images and their reconstructions using each decoder. Comment on any artifacts from the images. For full credit, the decoding of the images should look similar and the performance should be discussed.
# Plot training loss over time
#evaluate-decoder.py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
#from imageio import imwrite
from keras.preprocessing import image
from PIL import Image
#from model import EncoderDecoder
if EVALUATE:
    LAYER = 1
    cwd = os.getcwd()

    # Decoder and history stored under models/maneframe.  For a locally
    # trained decoder use DECODER_PATH = 'models/' and the matching
    # data/RafD_test image paths instead.
    DECODER_PATH = 'models/maneframe'
    INPUT_IMG_PATH = 'data/RafD_train/train/Rafd045_02_Caucasian_female_angry_frontal.jpg'
    OUTPUT_IMG_PATH = f'models/Rafd045_02_Caucasian_female_angry_frontal_layer_{LAYER}.jpg'

    decoder_full_path = os.path.join(cwd, DECODER_PATH)
    encoder_decoder = EncoderDecoder(target_layer=LAYER, decoder_path=decoder_full_path)

    input_full_path = os.path.join(cwd, INPUT_IMG_PATH)
    # load_img takes target_size as (height, width); no channel entry.
    original_images = image.load_img(input_full_path, target_size=(256, 256))
    # Scale to [0, 1]: the training generator fed the model images rescaled
    # by 1/255, so raw 0-255 pixels would be far outside the trained range.
    original_array = image.img_to_array(original_images) / 255.0
    input_images = np.expand_dims(original_array, axis=0)
    output_img = encoder_decoder.model.predict(input_images)

    output_full_path = os.path.join(cwd, OUTPUT_IMG_PATH)
    image.save_img(path=output_full_path, x=output_img[0], data_format="channels_last")

    encoder_decoder.Plot_History()
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_69'), name='input_69', description="created by layer 'input_69'")
Model: "model_25"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 128, 128, 64) 39488
decoder_1 (Functional) (None, 256, 256, 3) 76355
=================================================================
Total params: 115,843
Trainable params: 115,075
Non-trainable params: 768
_________________________________________________________________
None
# Open the original input image used by the evaluation cell above.
Image.open(input_full_path)
# Open the reconstruction that the evaluation cell saved to output_full_path.
Image.open(output_full_path)
In the first layer, we have details such as the model's hair and facial structure. But smaller details, such as the eyes and mouth, as well as the color, are not reconstructed well by the decoder. It appears that the network converges, but it is not reconstructing the image entirely.
# Plot training loss over time
#evaluate-decoder.py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
#from imageio import imwrite
from keras.preprocessing import image
from PIL import Image
#from model import EncoderDecoder
if EVALUATE:
    LAYER = 1
    cwd = os.getcwd()

    # Decoder and history stored under models/maneframe.  For a locally
    # trained decoder use DECODER_PATH = 'models/' and the matching
    # data/RafD_test image paths instead.
    DECODER_PATH = 'models/maneframe'
    INPUT_IMG_PATH = 'data/RafD_train/train/Rafd090_51_Moroccan_male_happy_left.jpg'
    OUTPUT_IMG_PATH = f'models/Rafd090_51_Moroccan_male_happy_left_layer_{LAYER}.jpg'

    decoder_full_path = os.path.join(cwd, DECODER_PATH)
    encoder_decoder = EncoderDecoder(target_layer=LAYER, decoder_path=decoder_full_path)

    input_full_path = os.path.join(cwd, INPUT_IMG_PATH)
    # load_img takes target_size as (height, width); no channel entry.
    original_images = image.load_img(input_full_path, target_size=(256, 256))
    # Scale to [0, 1]: the training generator fed the model images rescaled
    # by 1/255, so raw 0-255 pixels would be far outside the trained range.
    original_array = image.img_to_array(original_images) / 255.0
    input_images = np.expand_dims(original_array, axis=0)
    output_img = encoder_decoder.model.predict(input_images)

    output_full_path = os.path.join(cwd, OUTPUT_IMG_PATH)
    image.save_img(path=output_full_path, x=output_img[0], data_format="channels_last")

    encoder_decoder.Plot_History()
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_71'), name='input_71', description="created by layer 'input_71'")
Model: "model_26"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 128, 128, 64) 39488
decoder_1 (Functional) (None, 256, 256, 3) 76355
=================================================================
Total params: 115,843
Trainable params: 115,075
Non-trainable params: 768
_________________________________________________________________
None
# Open the original input image used by the evaluation cell above.
Image.open(input_full_path)
# Open the reconstruction that the evaluation cell saved to output_full_path.
Image.open(output_full_path)
Once again with the first layer, we have general details reconstructed, but smaller details are not reconstructed well with the decoder. The network converges, but does not reconstruct properly.
# Plot training loss over time
#evaluate-decoder.py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
#from imageio import imwrite
from keras.preprocessing import image
from PIL import Image
#from model import EncoderDecoder
if EVALUATE:
    LAYER = 2
    cwd = os.getcwd()

    # Decoder and history stored under models/maneframe.  For a locally
    # trained decoder use DECODER_PATH = 'models/' and the matching
    # data/RafD_test image paths instead.
    DECODER_PATH = 'models/maneframe'
    INPUT_IMG_PATH = 'data/RafD_train/train/Rafd045_02_Caucasian_female_angry_frontal.jpg'
    OUTPUT_IMG_PATH = f'models/Rafd045_02_Caucasian_female_angry_frontal_layer_{LAYER}.jpg'

    decoder_full_path = os.path.join(cwd, DECODER_PATH)
    encoder_decoder = EncoderDecoder(target_layer=LAYER, decoder_path=decoder_full_path)

    input_full_path = os.path.join(cwd, INPUT_IMG_PATH)
    # load_img takes target_size as (height, width); no channel entry.
    original_images = image.load_img(input_full_path, target_size=(256, 256))
    # Scale to [0, 1]: the training generator fed the model images rescaled
    # by 1/255, so raw 0-255 pixels would be far outside the trained range.
    original_array = image.img_to_array(original_images) / 255.0
    input_images = np.expand_dims(original_array, axis=0)
    output_img = encoder_decoder.model.predict(input_images)

    output_full_path = os.path.join(cwd, OUTPUT_IMG_PATH)
    image.save_img(path=output_full_path, x=output_img[0], data_format="channels_last")

    encoder_decoder.Plot_History()
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_77'), name='input_77', description="created by layer 'input_77'")
Model: "model_29"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 64, 64, 128) 262464
decoder_2 (Functional) (None, 256, 256, 3) 410179
=================================================================
Total params: 672,643
Trainable params: 670,211
Non-trainable params: 2,432
_________________________________________________________________
None
# Open the original input image used by the evaluation cell above.
Image.open(input_full_path)
# Open the reconstruction that the evaluation cell saved to output_full_path.
Image.open(output_full_path)
As we go further into the convolutional network, we start to lose details such as the facial features and the strands in the model's hair; even the color is washing out. This network converges much better than layer 1, and yet its reconstruction looks even worse than the layer 1 reconstructed image.
# Plot training loss over time
#evaluate-decoder.py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
#from imageio import imwrite
from keras.preprocessing import image
from PIL import Image
#from model import EncoderDecoder
if EVALUATE:
    LAYER = 2
    cwd = os.getcwd()

    # Decoder and history stored under models/maneframe.  For a locally
    # trained decoder use DECODER_PATH = 'models/' and the matching
    # data/RafD_test image paths instead.
    DECODER_PATH = 'models/maneframe'
    INPUT_IMG_PATH = 'data/RafD_train/train/Rafd090_51_Moroccan_male_happy_left.jpg'
    OUTPUT_IMG_PATH = f'models/Rafd090_51_Moroccan_male_happy_left_layer_{LAYER}.jpg'

    decoder_full_path = os.path.join(cwd, DECODER_PATH)
    encoder_decoder = EncoderDecoder(target_layer=LAYER, decoder_path=decoder_full_path)

    input_full_path = os.path.join(cwd, INPUT_IMG_PATH)
    # load_img takes target_size as (height, width); no channel entry.
    original_images = image.load_img(input_full_path, target_size=(256, 256))
    # Scale to [0, 1]: the training generator fed the model images rescaled
    # by 1/255, so raw 0-255 pixels would be far outside the trained range.
    original_array = image.img_to_array(original_images) / 255.0
    input_images = np.expand_dims(original_array, axis=0)
    output_img = encoder_decoder.model.predict(input_images)

    output_full_path = os.path.join(cwd, OUTPUT_IMG_PATH)
    image.save_img(path=output_full_path, x=output_img[0], data_format="channels_last")

    encoder_decoder.Plot_History()
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_87'), name='input_87', description="created by layer 'input_87'")
Model: "model_34"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 64, 64, 128) 262464
decoder_2 (Functional) (None, 256, 256, 3) 410179
=================================================================
Total params: 672,643
Trainable params: 670,211
Non-trainable params: 2,432
_________________________________________________________________
None
# Open the original input image used by the evaluation cell above.
Image.open(input_full_path)
# Open the reconstruction that the evaluation cell saved to output_full_path.
Image.open(output_full_path)
As with the last example, details are being lost, and the image is much blurrier. The loss curve shows that there was convergence, but the reconstruction shows that it did not work correctly.
# Plot training loss over time
#evaluate-decoder.py
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
#from imageio import imwrite
from keras.preprocessing import image
from PIL import Image
#from model import EncoderDecoder
if EVALUATE:
    LAYER = 3
    cwd = os.getcwd()

    # Decoder and history stored under models/maneframe.  For a locally
    # trained decoder use DECODER_PATH = 'models/' and the matching
    # data/RafD_test image paths instead.
    DECODER_PATH = 'models/maneframe'
    INPUT_IMG_PATH = 'data/RafD_train/train/Rafd045_02_Caucasian_female_angry_frontal.jpg'
    OUTPUT_IMG_PATH = f'models/Rafd045_02_Caucasian_female_angry_frontal_layer_{LAYER}.jpg'

    decoder_full_path = os.path.join(cwd, DECODER_PATH)
    encoder_decoder = EncoderDecoder(target_layer=LAYER, decoder_path=decoder_full_path)

    input_full_path = os.path.join(cwd, INPUT_IMG_PATH)
    # load_img takes target_size as (height, width); no channel entry.
    original_images = image.load_img(input_full_path, target_size=(256, 256))
    # Scale to [0, 1]: the training generator fed the model images rescaled
    # by 1/255, so raw 0-255 pixels would be far outside the trained range.
    original_array = image.img_to_array(original_images) / 255.0
    input_images = np.expand_dims(original_array, axis=0)
    output_img = encoder_decoder.model.predict(input_images)

    output_full_path = os.path.join(cwd, OUTPUT_IMG_PATH)
    image.save_img(path=output_full_path, x=output_img[0], data_format="channels_last")

    encoder_decoder.Plot_History()
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_93'), name='input_93', description="created by layer 'input_93'")
Model: "model_37"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 32, 32, 256) 2332992
decoder_3 (Functional) (None, 256, 256, 3) 2923587
=================================================================
Total params: 5,256,579
Trainable params: 5,248,771
Non-trainable params: 7,808
_________________________________________________________________
None
# Open the input image and the reconstruction written by save_img above.
# NOTE(review): presumably two separate notebook cells, so each image is
# rendered as that cell's output - confirm against the original notebook.
Image.open(input_full_path)
Image.open(output_full_path)
Going further into the network, we lose the majority of the details. We can still see the outline of the model's face, but color, facial features, hairline, and other elements that describe the face are lost. Training did not fully converge for this network, so more epochs could help recover the details.
# Evaluate a trained decoder (adapted from evaluate-decoder.py): reconstruct
# one image through the encoder/decoder, save the result, and plot the
# training loss over time.
import os
# Hide all CUDA devices from this process before the model is built.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import numpy as np
from keras.preprocessing import image
from PIL import Image

if EVALUATE:
    # Block whose decoder is evaluated (TARGET_LAYER was used previously).
    LAYER = 3
    cwd = os.getcwd()

    # Configuration used for local testing, kept for reference:
    # DECODER_PATH = f'models/'
    # INPUT_IMG_PATH = 'data/RafD_test/train/Rafd045_01_Caucasian_female_angry_frontal.jpg'
    # OUTPUT_IMG_PATH = f'models/Rafd045_01_Caucasian_female_angry_frontal_layer_{LAYER}.jpg'

    # Configuration for the models that came from Maneframe.
    DECODER_PATH = 'models/maneframe'
    INPUT_IMG_PATH = 'data/RafD_train/train/Rafd090_51_Moroccan_male_happy_left.jpg'
    OUTPUT_IMG_PATH = f'models/Rafd090_51_Moroccan_male_happy_left_layer_{LAYER}.jpg'

    decoder_full_path = os.path.join(cwd, DECODER_PATH)
    encoder_decoder = EncoderDecoder(target_layer=LAYER,
                                     decoder_path=decoder_full_path)

    # target_size is documented as (height, width); the channel count is not
    # part of it, so the former third element was dropped.
    input_full_path = os.path.join(cwd, INPUT_IMG_PATH)
    original_images = image.load_img(input_full_path, target_size=(256, 256))
    original_array = image.img_to_array(original_images)

    # The model expects a batch axis in front: (1, 256, 256, 3).
    input_images = np.expand_dims(original_array, axis=0)
    output_img = encoder_decoder.model.predict([input_images])

    # Save the single reconstructed image next to the models.
    output_full_path = os.path.join(cwd, OUTPUT_IMG_PATH)
    image.save_img(path=output_full_path, x=output_img[0],
                   data_format="channels_last")

    # Plot_History is defined on EncoderDecoder (outside this cell).
    encoder_decoder.Plot_History()
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
self.inputs:
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32, name='input_103'), name='input_103', description="created by layer 'input_103'")
Model: "model_42"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
img_input (InputLayer) [(None, 256, 256, 3)] 0
vgg19 (Functional) (None, 32, 32, 256) 2332992
decoder_3 (Functional) (None, 256, 256, 3) 2923587
=================================================================
Total params: 5,256,579
Trainable params: 5,248,771
Non-trainable params: 7,808
_________________________________________________________________
None
# Open the input image and the reconstruction written by save_img above.
# NOTE(review): presumably two separate notebook cells, so each image is
# rendered as that cell's output - confirm against the original notebook.
Image.open(input_full_path)
Image.open(output_full_path)
Once again, the details of the face are lost. We can still see a silhouette of the model, but all finer facial features are gone. More epochs are necessary for this network.
From our work building an autoencoder on the VGG19 architecture, we were able to replicate the architecture and build networks that take an image as input and output a reconstructed image. However, these reconstructed images degrade as we go deeper into the VGG autoencoder network. Further training is needed for the deeper networks to converge and reconstruct images properly.
Several changes improved the reconstructed images. Using batch normalization layers between the convolutional layers helped. When calculating the image reconstruction loss and the activation loss, normalizing the inputs remapped the loss from values in the millions to trillions down to values between 0 and 1. Changing the lambda to weight the image reconstruction more heavily than the encoder activations helped for deeper layers. We also constructed a validation set by selecting random images from the training set, adding random noise to them, and testing whether the network could reconstruct the original images. This last feature helped when running in the Jupyter notebook; however, Maneframe could not create these noisy images due to version issues. We tried updating the packages on Maneframe but could not find the correct documentation.
from PIL import Image
from skimage.transform import resize
import functools
import time
import PIL.Image
import numpy as np
import matplotlib as mpl
import matplotlib.animation as animation
import matplotlib.pyplot as plt
from IPython.display import HTML
import IPython.display as display
from pathlib import PurePath
import tensorflow as tf
import os
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime error
# when several libraries bundle their own copy - confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Could not get CUDA to work; TensorFlow should ship with a CUDA DLL that
# works for it, like PyTorch does. Hide all CUDA devices instead.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Default figure size and no grid lines for every plot in this notebook.
mpl.rcParams['figure.figsize'] = (12, 12)
mpl.rcParams['axes.grid'] = False
def pass_through(a):
    """Identity function: hand the argument back untouched.

    Used as a stand-in "decoder" where a stage has nothing to decode.
    """
    return a
def tensor_to_image(tensor):
    """Convert a float image with values in [0, 1] to a ``PIL.Image``.

    Args:
        tensor: Array-like image, either rank 3 or rank 4 with a leading
            batch axis of size 1.

    Returns:
        The image as an 8-bit ``PIL.Image.Image``.

    Raises:
        AssertionError: If a batch axis is present and its size is not 1.
    """
    pixels = np.asarray(tensor * 255)
    # Values outside [0, 1] would otherwise wrap around in the uint8 cast
    # (e.g. 1.01 * 255 -> 1), producing speckle artifacts.
    pixels = np.clip(pixels, 0, 255).astype(np.uint8)
    if np.ndim(pixels) > 3:
        assert pixels.shape[0] == 1, 'expected a batch of exactly one image'
        pixels = pixels[0]
    return PIL.Image.fromarray(pixels)
def load_img(path_to_img, channels=3, max_dim=512):
    """Load an image file as a float32 batch of one, scaled to ``max_dim``.

    The image is decoded, converted to float32 in [0, 1], and resized so
    that its longer side equals ``max_dim`` while keeping the aspect ratio.

    Args:
        path_to_img: Path of the image file to read.
        channels: Number of color channels to decode.
        max_dim: Target length, in pixels, of the longer image side.

    Returns:
        A tensor with a new leading batch axis in front of the resized image.
    """
    img = tf.io.read_file(path_to_img)
    # expand_animations=False keeps the result 3-D even for GIF input, which
    # the shape arithmetic below relies on.
    img = tf.image.decode_image(img, channels=channels,
                                expand_animations=False)
    img = tf.image.convert_image_dtype(img, tf.float32)

    # Spatial size (height, width) as floats so it can be scaled.
    shape = tf.cast(tf.shape(img)[:-1], tf.float32)
    # reduce_max stays a tensor op instead of iterating the tensor in Python.
    scale = max_dim / tf.reduce_max(shape)
    new_shape = tf.cast(shape * scale, tf.int32)

    img = tf.image.resize(img, new_shape)
    return img[tf.newaxis, :]
def squeeze_axis(image):
    """Remove the leading axis of ``image`` when its rank is above 3.

    Images of rank 3 or lower are returned unchanged.
    """
    has_batch_axis = len(image.shape) > 3
    return tf.squeeze(image, axis=0) if has_batch_axis else image
def add_axis(image):
    """Return ``image`` with a new axis inserted at the front."""
    return image[tf.newaxis, :]
def imshow(image, title=None):
    """Draw ``image`` on the current matplotlib axes and set its title.

    An image of rank above 3 has its leading axis squeezed away first.
    """
    needs_squeeze = len(image.shape) > 3
    picture = tf.squeeze(image, axis=0) if needs_squeeze else image
    plt.imshow(picture)
    plt.title(title)
class VGG19AutoEncoder(tf.keras.Model):
    """Cascade of VGG19 encoder/decoder pairs for whiten-color style transfer.

    Three saved auto-encoders (blocks 1 to 3) are loaded from disk. For
    block ``k`` the attributes are ``E{k}`` (encoder), ``D{k}`` (decoder;
    block 1 has none) and ``O{k}`` (final convolution producing the RGB
    image). Stylization runs from block 3 down to block 1, each stage
    re-encoding the image produced by the previous stage.
    """

    def __init__(self, files_path):
        """Load the block 1-3 models found under ``files_path``.

        Args:
            files_path: Directory containing the saved ``Block1_Model``,
                ``Block2_Model`` and ``Block3_Model`` models.
        """
        super().__init__()
        # Each sub-model has an encoder, a decoder, and an extra output
        # convolution that converts the upsampled activations into an image.
        # DO NOT load models four and five because they are not great
        # auto-encoders and cause weird artifacts when used for style
        # transfer.
        block3 = self._load_block_layers(files_path, 'Block3_Model')
        self.E3 = block3[0]  # VGG encoder
        self.D3 = block3[1]  # Trained decoder from VGG
        self.O3 = block3[2]  # Conv layer to get to three channels (RGB)

        block2 = self._load_block_layers(files_path, 'Block2_Model')
        self.E2 = block2[0]  # VGG encoder
        self.D2 = block2[1]  # Trained decoder from VGG
        self.O2 = block2[2]  # Conv layer to get to three channels (RGB)

        # No special decoder for block 1 because the first VGG block has no
        # downsampling, so the decoder is just the output convolution.
        block1 = self._load_block_layers(files_path, 'Block1_Model')
        self.E1 = block1[0]  # VGG encoder, one layer
        self.O1 = block1[1]  # Conv layer to get to three channels (RGB)

    @staticmethod
    def _load_block_layers(files_path, block_name):
        """Load one saved block model (uncompiled) and return its layers."""
        model = tf.keras.models.load_model(
            str(PurePath(files_path, block_name)), compile=False)
        return model.layers

    def _stage(self, model_id):
        """Return ``(encoder, decoder, final_layer)`` for block ``model_id``."""
        encoder = getattr(self, f'E{model_id}')
        if model_id == 1:
            decoder = pass_through  # block 1 has no decoder
        else:
            decoder = getattr(self, f'D{model_id}')
        final_layer = getattr(self, f'O{model_id}')
        return encoder, decoder, final_layer

    def show_reconstructions(self, image_paths):
        """Plot each image next to its reconstruction from blocks 1, 2 and 3.

        Args:
            image_paths: Iterable of image file paths to reconstruct.
        """
        for image_path in image_paths:
            # The image does not depend on the block, so load it only once.
            original_image = load_img(image_path)
            for model_id in (1, 2, 3):
                model_name = f'Block{model_id}_Model'
                encoder, decoder, final_layer = self._stage(model_id)

                print(image_path)
                plt.figure(figsize=(20, 20))
                plt.subplot(1, 2, 1)
                imshow(original_image, 'Original')

                plt.subplot(1, 2, 2)
                encoded = encoder(tf.constant(original_image))
                reconstructed = final_layer(decoder(encoded))
                imshow(reconstructed, f'Reconstructed, Model: {model_name}')

    def call_style_blend(self,
                         content_image,
                         style_a_image,
                         style_b_image,
                         alpha_a,
                         alpha_b,
                         alpha_content):
        """Stylize ``content_image`` with a weighted mix of two styles.

        Args:
            content_image: Batch-of-one content image tensor.
            style_a_image: Batch-of-one image tensor for style A.
            style_b_image: Batch-of-one image tensor for style B.
            alpha_a: Weight of the activations colored with style A.
            alpha_b: Weight of the activations colored with style B.
            alpha_content: Weight of the unmodified content activations.

        Returns:
            The stylized image, squeezed and clipped to [0, 1].
        """
        x = content_image
        for model_id in (3, 2, 1):
            encoder, decoder, final_layer = self._stage(model_id)
            blended_activations = VGG19AutoEncoder.style_blend(
                encoder(tf.constant(x)),
                encoder(tf.constant(style_a_image)),
                encoder(tf.constant(style_b_image)),
                alpha_content,
                alpha_a,
                alpha_b)
            blended_image = final_layer(decoder(blended_activations))
            x = self.enhance_contrast(blended_image)
        return tf.clip_by_value(tf.squeeze(x), 0, 1)

    def call_one_style(self, content_image, style_image, alpha_style=0.8):
        """Stylize ``content_image`` with a single style image.

        Args:
            content_image: Batch-of-one content image tensor.
            style_image: Batch-of-one style image tensor.
            alpha_style: Weight of the colored activations; the content
                activations get ``1 - alpha_style``.

        Returns:
            The stylized image, squeezed and clipped to [0, 1].
        """
        x = content_image
        for model_id in (3, 2, 1):
            encoder, decoder, final_layer = self._stage(model_id)
            colored_activations = VGG19AutoEncoder.wct_from_cov(
                encoder(tf.constant(x)),
                encoder(tf.constant(style_image)),
                alpha_style)
            colored_image = final_layer(decoder(colored_activations))
            x = self.enhance_contrast(colored_image)
        return tf.clip_by_value(tf.squeeze(x), 0, 1)

    def call(self, image, alphas=None, training=False):
        """Stylize an image and return per-layer results.

        Args:
            image: Dictionary ``{'style': style_image, 'content':
                content_image}``; each value is a 4-D tensor
                (batch, i, j, channel).
            alphas: Optional dictionary with keys ``'layer3'``, ``'layer2'``
                and ``'layer1'`` giving the style weight per layer. Defaults
                to 0.6 for every layer.
            training: Unused; kept for the Keras ``call`` signature.

        Returns:
            Dictionary keyed ``'layer3'``, ``'layer2'``, ``'layer1'``; each
            value is a tuple ``(styled, reconstructed)`` of images clipped to
            [0, 1]. ``reconstructed`` is the plain auto-encoder output for the
            original content image at that layer.
        """
        style_image = image['style']
        content_image = image['content']

        if alphas is None:
            alphas = {'layer3': 0.6,
                      'layer2': 0.6,
                      'layer1': 0.6}

        # Contrast boost applied after decoding each layer.
        contrast_factors = {3: 1.25, 2: 1.3, 1: 1.2}

        output_dict = dict()
        x = content_image
        for model_id in (3, 2, 1):
            layer_key = f'layer{model_id}'
            encoder, decoder, final_layer = self._stage(model_id)

            # Whiten/color the activations of the image styled so far
            # (the original content image for layer 3).
            a_c = encoder(tf.constant(x))
            a_s = encoder(tf.constant(style_image))
            x = self.wct_from_cov(a_c.numpy(), a_s.numpy(),
                                  alpha=alphas[layer_key])

            # Decode the new style.
            x = final_layer(decoder(x))
            x = self.enhance_contrast(x, contrast_factors[model_id])

            # Plain reconstruction of the untouched content image.
            reconstructed = final_layer(
                decoder(encoder(tf.constant(content_image))))

            # Save off the styled and reconstructed images for display.
            output_dict[layer_key] = (
                tf.clip_by_value(tf.squeeze(x), 0, 1),
                tf.clip_by_value(tf.squeeze(reconstructed), 0, 1),
            )
        return output_dict

    @staticmethod
    def enhance_contrast(image, factor=1.25):
        """Return ``image`` with its contrast adjusted by ``factor``."""
        return tf.image.adjust_contrast(image, factor)

    @staticmethod
    def decompose(activations):
        """Decompose the channel covariance of encoded activations.

        Args:
            activations: Array-like of shape 1xHxWxC (or HxWxC).

        Returns:
            Tuple ``(U, sigma_diag_values, activations_flat_zero_mean,
            channel_means, shape_C_H_W)`` where ``U`` and
            ``sigma_diag_values`` come from the SVD of the CxC covariance
            matrix with singular values at or below 1e-5 discarded.
            The flattened sigma makes some operations easier later.
        """
        eps = 1e-5
        activations = np.asarray(activations)
        if activations.ndim == 4 and activations.shape[0] == 1:
            # Drop only the batch axis; a blanket squeeze would also remove
            # spatial or channel axes that happen to have size one.
            activations = activations[0]
        elif activations.ndim != 3:
            activations = np.squeeze(activations)

        # HxWxC -> CxHxW
        activations_t = np.transpose(activations, (2, 0, 1))
        shape_C_H_W = activations_t.shape
        num_positions = shape_C_H_W[1] * shape_C_H_W[2]

        # CxHxW -> CxH*W
        activations_flat = activations_t.reshape(-1, num_positions)
        channel_means = activations_flat.mean(axis=1, keepdims=True)
        activations_flat_zero_mean = activations_flat - channel_means

        # Guard the divisor so a single spatial position does not divide
        # by zero.
        covariance_mat = np.dot(activations_flat_zero_mean,
                                activations_flat_zero_mean.T) / \
            max(num_positions - 1, 1)

        U, Sigma, _ = np.linalg.svd(covariance_mat)
        # Discard small values.
        rank = (Sigma > eps).sum()
        return (
            U[:, :rank],
            Sigma[:rank],
            activations_flat_zero_mean,
            channel_means,
            shape_C_H_W
        )

    @staticmethod
    def _whiten(content):
        """Whiten content activations; return ``(whitened, shape_C_H_W)``."""
        (content_u,
         content_sigma_diag_values,
         content_activations_flat_zero_mean,
         _, shape_C_H_W) = VGG19AutoEncoder.decompose(content)
        content_d = np.diag(1 / np.sqrt(content_sigma_diag_values))
        content_whitened = (content_u @
                            content_d @
                            content_u.T) @ content_activations_flat_zero_mean
        return content_whitened, shape_C_H_W

    @staticmethod
    def _color(content_whitened, style, shape_C_H_W):
        """Color whitened CxH*W activations with ``style``; 1xHxWxC result."""
        (style_u,
         style_sigma_diag_values,
         _, style_channel_means, _) = VGG19AutoEncoder.decompose(style)
        style_d = np.diag(np.sqrt(style_sigma_diag_values))
        colored = (style_u @ style_d @ style_u.T) @ content_whitened
        # Add the style mean back to each channel.
        colored = colored + style_channel_means
        # CxH*W -> CxHxW
        colored = colored.reshape(shape_C_H_W)
        # CxHxW -> 1xHxWxC
        return np.expand_dims(np.transpose(colored, (1, 2, 0)), 0)

    @staticmethod
    def style_blend(content,
                    style_a,
                    style_b,
                    alpha_content,
                    alpha_a,
                    alpha_b):
        """Blend content activations colored with two different styles.

        Args:
            content: Content activations (tensor or array), 1xHxWxC.
            style_a: Activations of style image A.
            style_b: Activations of style image B.
            alpha_content: Weight of the unmodified content activations.
            alpha_a: Weight of the activations colored with style A.
            alpha_b: Weight of the activations colored with style B.

        Returns:
            float32 array holding the weighted sum of the three terms.
        """
        content = np.asarray(content)
        style_a = np.asarray(style_a)
        style_b = np.asarray(style_b)

        # Whiten once, then color the same whitened content with each style.
        content_whitened, shape_C_H_W = VGG19AutoEncoder._whiten(content)
        content_colored_a = VGG19AutoEncoder._color(
            content_whitened, style_a, shape_C_H_W)
        content_colored_b = VGG19AutoEncoder._color(
            content_whitened, style_b, shape_C_H_W)

        blended = alpha_a * content_colored_a + \
            alpha_b * content_colored_b + \
            alpha_content * content
        return np.float32(blended)

    @staticmethod
    def wct_from_cov(content, style, alpha=0.6):
        """Perform the Whiten-Color Transform on feature maps using numpy.

        Based on https://github.com/eridgd/WCT-TF/blob/master/ops.py
        See p.4 of the Universal Style Transfer paper for equations:
        https://arxiv.org/pdf/1705.08086.pdf

        Args:
            content: Content activations (tensor or array), 1xHxWxC.
            style: Style activations (tensor or array).
            alpha: Weight of the colored activations; the original content
                activations get ``1 - alpha``.

        Returns:
            float32 array with the blended activations.
        """
        # Coerce up front so the final blend is plain numpy arithmetic even
        # when callers pass tensors.
        content = np.asarray(content)
        style = np.asarray(style)

        content_whitened, shape_C_H_W = VGG19AutoEncoder._whiten(content)
        content_colored = VGG19AutoEncoder._color(
            content_whitened, style, shape_C_H_W)

        blended = alpha * content_colored + (1 - alpha) * content
        return np.float32(blended)
# Build the auto-encoder from the models stored under models/vgg_decoder/.
# AE is reused by the cells below.
AE = VGG19AutoEncoder('models/vgg_decoder/')
# Images to reconstruct; commented-out entries are skipped.
IMAGES_TO_RECONSTRUCT = [
'images/dallas_hall.jpg',
'images/dog.jpg',
# 'images/doge.jpg',
# 'images/newton.jpg',
# 'images/python.jpg',
# 'images/anakin.png',
'images/starry_style.png',
# 'images/wave_style.png',
]
# Plot every image next to its reconstruction from blocks 1, 2 and 3.
AE.show_reconstructions(IMAGES_TO_RECONSTRUCT)
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
images/dallas_hall.jpg images/dallas_hall.jpg
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
images/dallas_hall.jpg
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
images/dog.jpg images/dog.jpg
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
images/dog.jpg
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
images/starry_style.png images/starry_style.png
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
images/starry_style.png
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
# Single-style transfer: Dallas Hall styled with Van Gogh's Sunflowers.
content_path = 'images/dallas_hall.jpg'
style_path = 'images/Vincent_van_Gogh_Sunflowers.jpg'
content_image = load_img(content_path)
style_image = load_img(style_path)

# Content and style side by side.
plt.subplot(1, 2, 1)
imshow(content_image, 'Content')
plt.subplot(1, 2, 2)
imshow(style_image, 'Style')

# Run the auto-encoder with a per-layer style weight.
tmp = {'style': style_image, 'content': content_image}
alphas = {'layer3': 0.8, 'layer2': 0.6, 'layer1': 0.6}
decoded_images = AE(tmp, alphas=alphas)

imshow(style_image, 'Style')
# One figure per layer: styled result on the left, reconstruction on the right.
for layer in decoded_images:
    plt.figure(figsize=(10, 10))
    plt.subplot(1, 2, 1)
    imshow(decoded_images[layer][0], 'Styled')
    plt.subplot(1, 2, 2)
    imshow(decoded_images[layer][1], 'Reconstructed')
# Single-style transfer: the python image styled with the starry style image.
content_path = 'images/python.jpg'
style_path = 'images/starry_style.png'
content_image = load_img(content_path)
style_image = load_img(style_path)

# Content and style side by side.
plt.subplot(1, 2, 1)
imshow(content_image, 'Content')
plt.subplot(1, 2, 2)
imshow(style_image, 'Style')

# Run the auto-encoder with a per-layer style weight.
tmp = {'style': style_image, 'content': content_image}
alphas = {'layer3': 0.8, 'layer2': 0.6, 'layer1': 0.6}
decoded_images = AE(tmp, alphas=alphas)

imshow(style_image, 'Style')
# One figure per layer: styled result on the left, reconstruction on the right.
for layer in decoded_images:
    plt.figure(figsize=(10, 10))
    plt.subplot(1, 2, 1)
    imshow(decoded_images[layer][0], 'Styled')
    plt.subplot(1, 2, 2)
    imshow(decoded_images[layer][1], 'Reconstructed')
# ## Quality of the stylized images
# - I don't think these look very good. Color is transferred, and we can later see with the HSV coloring task that 'tone' or 'mood' is also transferred, but the way shapes interact with each other is not transferred.
# - The reason might be that the encoders/decoders only use the shallow parts of VGG, where the lower-level features reside. Thus the higher-level features/styles are not transferred.
# Blend two styles onto one content image at several mixing ratios.
content_path = 'images/dallas_hall.jpg'
style_a_path = 'images/Vincent_van_Gogh_Sunflowers.jpg'
style_b_path = 'images/mosaic_style.png'
content_image = load_img(content_path)
style_a_image = load_img(style_a_path)
style_b_image = load_img(style_b_path)

# Inputs side by side.
plt.subplot(1, 3, 1)
imshow(content_image, 'Content')
plt.subplot(1, 3, 2)
imshow(style_a_image, 'Style A')
plt.subplot(1, 3, 3)
imshow(style_b_image, 'Style B')

# Sweep from all style A to all style B; the content weight stays at 0.2.
alphas = [
    {'alpha_a': 0.8, 'alpha_b': 0.0, 'alpha_content': 0.2},
    {'alpha_a': 0.6, 'alpha_b': 0.2, 'alpha_content': 0.2},
    {'alpha_a': 0.4, 'alpha_b': 0.4, 'alpha_content': 0.2},
    {'alpha_a': 0.2, 'alpha_b': 0.6, 'alpha_content': 0.2},
    {'alpha_a': 0.0, 'alpha_b': 0.8, 'alpha_content': 0.2},
]

plt.figure(figsize=(20, 20))
for i, alpha_set in enumerate(alphas):
    plt.subplot(1, 5, i+1)
    decoded_image = AE.call_style_blend(content_image,
                                        style_a_image,
                                        style_b_image,
                                        **alpha_set)
    imshow(decoded_image,
           f'A:{alpha_set["alpha_a"]} B:{alpha_set["alpha_b"]}')
def to_pillow_image(image):
    """Convert a float image tensor with values in [0, 1] to a Pillow image.

    Args:
        image: Tensor exposing ``.numpy()``; an image of rank above 3 has
            its leading axis squeezed away first.

    Returns:
        The image as an 8-bit ``PIL.Image.Image``.
    """
    if len(image.shape) > 3:
        image = tf.squeeze(image, axis=0)
    # Clip before the cast: values outside [0, 1] would otherwise wrap
    # around in uint8 instead of saturating.
    image_np = np.clip(image.numpy() * 255, 0, 255).astype(np.uint8)
    return Image.fromarray(image_np)
def masked_blend(content_path, mask_path, style_a_path, style_b_path):
    """Stylize one image with two styles and combine them through a mask.

    The content image is stylized once per style; the two results are then
    composited with the mask. Per ``Image.composite``, white mask areas take
    style A and black areas take style B.

    Args:
        content_path: Path of the content image.
        mask_path: Path of the mask image (converted to grayscale).
        style_a_path: Path of the style image used where the mask is white.
        style_b_path: Path of the style image used where the mask is black.
    """
    content_image = load_img(content_path)
    # Load the styles from the given paths rather than relying on
    # module-level variables that may hold different images.
    style_a_image = load_img(style_a_path)
    style_b_image = load_img(style_b_path)

    img_style_a = to_pillow_image(
        AE.call_one_style(content_image, style_a_image))
    img_style_b = to_pillow_image(
        AE.call_one_style(content_image, style_b_image))

    # Image.composite needs the mask to match the composited images, so
    # size the mask from the stylized output.
    mask_image_pil = Image.open(mask_path).convert(
        'L').resize(img_style_a.size)
    im = Image.composite(
        img_style_a, img_style_b, mask_image_pil)

    # Inputs: both styles, the content image and the mask.
    plt.figure(figsize=(20, 20))
    plt.subplot(1, 4, 1)
    imshow(style_a_image, 'Style A')
    plt.subplot(1, 4, 2)
    imshow(style_b_image, 'Style B')
    plt.subplot(1, 4, 3)
    imshow(content_image, 'Original')
    plt.subplot(1, 4, 4)
    plt.imshow(mask_image_pil, cmap='gray')
    plt.title('Mask')

    # Result.
    plt.figure(figsize=(20, 20))
    plt.imshow(im)
    plt.title('Masked Styles')
# Masked two-style blend of Dallas Hall; style_a_path and style_b_path are
# the module-level paths defined in an earlier cell.
content_path = 'images/dallas_hall.jpg'
mask_path = 'images/dallas_hall_mask.jpg'
masked_blend(content_path, mask_path, style_a_path, style_b_path)
# Same blend for the Newton image with its own mask.
content_path = 'images/newton.jpg'
mask_path = 'images/newton_mask.jpg'
masked_blend(content_path, mask_path, style_a_path, style_b_path)
# Show the HSV reference image (images/HSV.png) in its own figure.
hsv_cylinder = Image.open('images/HSV.png')
plt.figure(figsize=(10, 10))
plt.imshow(hsv_cylinder)
plt.title('hsv cylinder')
Text(0.5, 1.0, 'hsv cylinder')
def color_preserving_transfer(content_path, style_path):
    """Stylize an image while keeping the original hue and saturation.

    The content image is stylized, both images are converted to HSV, and
    the final image takes hue and saturation from the original and value
    from the stylized image. Intermediate steps are plotted.

    Args:
        content_path: Path of the content image.
        style_path: Path of the style image.
    """
    content_image = load_img(content_path)
    style_image = load_img(style_path)
    hsv_original = mpl.colors.rgb_to_hsv(squeeze_axis(content_image))

    # First figure: original image, its HSV channels, the HSV round trip,
    # and the style image.
    plt.figure(figsize=(20, 20))
    plt.subplot(1, 6, 1)
    imshow(content_image, 'original')
    plt.subplot(1, 6, 2)
    plt.imshow(hsv_original[:, :, 0], cmap='hsv')
    plt.title('hue')
    plt.subplot(1, 6, 3)
    plt.imshow(hsv_original[:, :, 1])
    plt.title('saturation')
    plt.subplot(1, 6, 4)
    plt.imshow(hsv_original[:, :, 2])
    plt.title('value')
    rgb = mpl.colors.hsv_to_rgb(hsv_original)
    plt.subplot(1, 6, 5)
    plt.imshow(rgb)
    plt.title('hsv_to_rgb')
    plt.subplot(1, 6, 6)
    imshow(style_image)
    plt.title('style image')

    stylized_image = AE.call_one_style(content_image, style_image)

    # Second figure: stylized image, color-restored image, stylized value.
    plt.figure(figsize=(20, 20))
    plt.subplot(1, 3, 1)
    imshow(stylized_image, 'stylized without color preservation')

    hsv_stylized = mpl.colors.rgb_to_hsv(squeeze_axis(stylized_image))
    # Work on a copy so the original HSV array is not silently overwritten.
    hsv_color_restored = hsv_original.copy()
    hsv_color_restored[:, :, 2] = hsv_stylized[:, :, 2]
    rgb_color_restored = mpl.colors.hsv_to_rgb(hsv_color_restored)

    plt.subplot(1, 3, 2)
    plt.imshow(rgb_color_restored)
    plt.title('stylized with color restored')
    plt.subplot(1, 3, 3)
    plt.imshow(hsv_stylized[:, :, 2])
    plt.title('value stylized')
# Color-preserving transfer: Newton image with the Sunflowers style.
content_path = 'images/newton.jpg'
style_path = 'images/Vincent_van_Gogh_Sunflowers.jpg'
color_preserving_transfer(content_path, style_path)
# Color-preserving transfer: doge image with the starry style image.
content_path = 'images/doge.jpg'
style_path = 'images/starry_style.png'
color_preserving_transfer(content_path, style_path)
https://commons.wikimedia.org/wiki/File:Vincent_van_Gogh_-_Sunflowers_-_VGM_F458.jpg
HSV.png